載入套件
library(tidyverse)
## -- Attaching packages -------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.0 √ purrr 0.2.5
## √ tibble 1.4.2 √ dplyr 0.7.8
## √ tidyr 0.8.2 √ stringr 1.3.1
## √ readr 1.3.1 √ forcats 0.3.0
## -- Conflicts ----------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(nsprcomp)
library(ggfortify)head(USArrests,5)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6R的主成份分析套件:prcomp() - scale = TRUE:變數標準化
pca.model <- prcomp(USArrests,scale=TRUE)
pca.model
## Standard deviations (1, .., p=4):
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432names(pca.model)
## [1] "sdev" "rotation" "center" "scale" "x"選擇多少個主成份 - 累積解釋比率圖
var.exp <- tibble(
#各主成分名稱
pc = paste0("PC_",format(1:4,width=2,flag=0)),
#求出每個主成份的特徵值(也就是variance = std^2)
var = pca.model$sdev^2,
#計算每個主成分的解釋比例 = 各個主成份的特徵值/總特徵值
prop = var / sum(var),
#累加每個主成份的解釋比例(aggregated effects)
cum_prop = cumsum(prop)
)
plot_ly(
x = var.exp$pc,
y = var.exp$var,
type = "bar"
) %>%
layout(
titl = "Variance Explained by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Variance'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)
plot_ly(
x = var.exp$pc,
y = var.exp$cum_prop,
type = "bar"
) %>%
layout(
titl = "Cumulative Proportion by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Proportion'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)
ggplot(melt(pca.model$rotation[,1:2]),aes(Var2, Var1)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(low = "firebrick4", high = "steelblue", mid = "white", midpoint = 0) +
guides(fill=guide_legend(title="Correlation")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title = element_blank())
R的非負稀疏主成份分析套件:nsprcomp() - k = 非0係數個數(每個主成份期待非0係數個數*變數個數),nneg = TRUE(是否希望所有係數都非負)
nspca.model <- nsprcomp(USArrests, k=20, nneg=TRUE, scale=TRUE)
nspca.model
## Standard deviations (1, .., p=4):
## [1] 1.5748772 0.9002263 0.5393761 0.3033552
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Murder 0.5363066 0.00000000 0 0
## Assault 0.5835504 0.00000000 0 1
## UrbanPop 0.2786504 0.99767981 0 0
## Rape 0.5424004 0.06808079 1 0選擇多少個主成份 - 累積解釋比率圖
var.exp <- tibble(
#各主成分名稱
pc = paste0("PC_",format(1:4,width=2,flag=0)),
#求出每個主成份的特徵值(也就是variance = std^2)
var = nspca.model$sdev^2,
#計算每個主成分的解釋比例 = 各個主成份的特徵值/總特徵值
prop = var / sum(var),
#累加每個主成份的解釋比例(aggregated effects)
cum_prop = cumsum(prop)
)
plot_ly(
x = var.exp$pc,
y = var.exp$var,
type = "bar"
) %>%
layout(
titl = "Variance Explained by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Variance'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)
plot_ly(
x = var.exp$pc,
y = var.exp$cum_prop,
type = "bar"
) %>%
layout(
titl = "Cumulative Proportion by Each Principal Component",
xaxis = list(type = 'Principal Component', tickangle = -60),
yaxis = list(title = 'Proportion'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)
ggplot(melt(nspca.model$rotation[,1:2]),aes(Var2, Var1)) +
geom_tile(aes(fill = value), colour = "white") +
scale_fill_gradient2(low = "firebrick4", high = "steelblue", mid = "white", midpoint = 0) +
guides(fill=guide_legend(title="Correlation")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title = element_blank())
pca.score <- data.frame(nspca.model$x)
plot_ly(
x = pca.score[, 1],
y = pca.score[, 2],
text = row.names(USArrests),
type = "scatter",
mode = "text"
) %>% layout(
title = "PC 1 v.s. PC 2 Score: Scatter Plot",
xaxis = list(title = 'Principal Component 1'),
yaxis = list(title = 'Principal Component 2'),
margin = list(r = 30, t = 50, b = 70, l = 50)
)
PCbiplot <- function(PC, x="PC1", y="PC2", colors=c('black', 'black', 'red', 'red')) {
# PC being a prcomp object
data <- data.frame(obsnames=row.names(PC$x), PC$x)
plot <- ggplot(data, aes_string(x=x, y=y)) + geom_text(alpha=.4, size=3, aes(label=obsnames), color=colors[1])
# plot <- plot + geom_hline(aes(0), size=.2) + geom_vline(aes(0), size=.2, color=colors[2])
datapc <- data.frame(varnames=rownames(PC$rotation), PC$rotation)
mult <- min(
(max(data[,y]) - min(data[,y])/(max(datapc[,y])-min(datapc[,y]))),
(max(data[,x]) - min(data[,x])/(max(datapc[,x])-min(datapc[,x]))))
datapc <- transform(datapc,
v1 = .7 * mult * (get(x)),
v2 = .7 * mult * (get(y)))
plot <- plot + coord_equal() + geom_text(data=datapc, aes(x=v1, y=v2, label=varnames), size = 5, vjust=1, color=colors[3])
plot <- plot + geom_segment(data=datapc, aes(x=0, y=0, xend=v1, yend=v2), arrow=arrow(length=unit(0.2,"cm")), alpha=0.75, color=colors[4])
plot
}
PCbiplot(nspca.model, colors=c("black", "black", "red", "yellow"))